Import useful stuff and define ancillary functions


In [38]:
%pylab inline
%load_ext autoreload
%autoreload 2

from __future__ import division

from collections import defaultdict, namedtuple
from datetime import datetime, timedelta
from functools import partial
import inspect
import json
import os
import re
import sys
import cPickle as pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def analyze_str_columns(cols, df, only_percent=False):
    """Print per-value target counts and percentages for string columns.

    For each column in `cols`, tally how many rows holding each unique
    value (NaN/None treated as its own value) fall into target==0 vs
    target==1, then print the raw counts (unless `only_percent`) and the
    column-wise percentage split.

    Parameters
    ----------
    cols : iterable of str
        Names of columns of `df` to analyze.
    df : pandas.DataFrame
        Must contain the columns in `cols` plus a binary 'target' column.
    only_percent : bool, optional
        If True, skip the counts table and print only the percentages.
    """
    # Parenthesized prints work identically in Python 2 and 3.
    print('Total samples: %s' % len(df))

    for c in cols:
        print('##############################')
        VAR_df = df[[c, 'target']]
        unique_vals = VAR_df[c].unique()
        str_0 = []
        str_1 = []
        col_names = []
        for u in unique_vals:
            # NaNs are the only non-string values in these columns, so a
            # non-str unique value means "missing": match it via isnull().
            if isinstance(u, str):
                col_mask = (VAR_df[c] == u)
            else:
                col_mask = VAR_df[c].isnull()
            str_0.append(len(VAR_df[col_mask & (VAR_df['target'] == 0)]))
            str_1.append(len(VAR_df[col_mask & (VAR_df['target'] == 1)]))

            col_names.append('%s_%s' % (c, u))
        VAR_df_counts = pd.DataFrame([str_0, str_1],
                                     columns=col_names,
                                     index=pd.Index([0, 1], name='target'))
        if not only_percent:
            print("------Counts-------")
            print(VAR_df_counts)
        print("----Percentages----")
        print(VAR_df_counts / VAR_df_counts.sum() * 100)


Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
WARNING: pylab import has clobbered these variables: ['datetime']
`%matplotlib` prevents importing * from pylab and numpy

Load train data

Using pandas' read_csv with all the defaults


In [2]:
# Resolve the data-file locations for the current platform: the D: drive
# on the Windows box, an external drive mount on the Linux one.
if os.name == 'nt':
    TRAIN_PATH, PTRAIN_PATH, TEST_PATH, GOOGNEWS_PATH, VOCAB_PATH = (
        r'D:\train.csv',
        r'D:\train_preprocessed_float_string.csv',
        r'D:\test.csv',
        r'D:\GoogleNews-vectors-negative300.bin.gz',
        r'D:\big.txt',
    )
else:
    TRAIN_PATH, PTRAIN_PATH, TEST_PATH, GOOGNEWS_PATH, VOCAB_PATH = (
        r'/media/mtambos/speedy/train.csv',
        r'/media/mtambos/speedy/train_preprocessed_float_string.csv',
        r'/media/mtambos/speedy/test.csv',
        r'/media/mtambos/speedy/GoogleNews-vectors-negative300.bin.gz',
        r'/media/mtambos/speedy/big.txt',
    )
df = pd.read_csv(PTRAIN_PATH, index_col="ID")


/home/mtambos/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (8,9,10,11,12,23,137,173,186,194,197,198,200,204,207) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

Define columns


In [3]:
# Candidate string columns identified by inspection of the raw data.
str_cols = [u'VAR_0001', u'VAR_0005', u'VAR_0044',
            u'VAR_0200', u'VAR_0202', u'VAR_0214',
            u'VAR_0216', u'VAR_0222', u'VAR_0237',
            u'VAR_0274', u'VAR_0283', u'VAR_0305',
            u'VAR_0325', u'VAR_0342', u'VAR_0352',
            u'VAR_0353', u'VAR_0354', u'VAR_0404',
            u'VAR_0466', u'VAR_0467', u'VAR_0493',
            u'VAR_1934']
try:
    # Keep only the candidates actually present as object-dtype columns.
    # np.object was a deprecated alias for the builtin ``object`` and was
    # removed in NumPy >= 1.24; comparing against ``object`` is
    # equivalent and works on every NumPy version.
    str_cols = [c for c in str_cols if c in df.columns and df[c].dtype == object]
except NameError:
    # df not loaded in this session -- keep the full candidate list.
    pass

See if the classes are skewed


In [5]:
# Count the samples in each class to gauge skew.  Summing the boolean
# mask avoids materializing a filtered Series just to take its length,
# and the parenthesized prints work in both Python 2 and 3.
neg_samples_count = (df['target'] == 0).sum()
pos_samples_count = (df['target'] == 1).sum()
print('%s negative samples; %.2f%% of total' % (neg_samples_count, neg_samples_count / len(df) * 100))
print('%s positive samples; %.2f%% of total' % (pos_samples_count, pos_samples_count / len(df) * 100))


111458 negative samples; 76.75% of total
33773 positive samples; 23.25% of total

Cast the string columns to text and make null-like data uniform: None instead of the assorted markers nan, -1, [], empty string, etc.


In [4]:
def filter_str(str_cell):
    """Normalize one raw cell to a lowercase token string, or None.

    Collapses every run of non-alphanumeric characters (underscore
    included) into a single space, trims and lowercases the result, and
    maps the null-like markers ('1', '-1', '[]', 'nan', '') to None so
    missing data looks uniform across columns.
    """
    cleaned = re.sub(r'[\W_]+', ' ', str(str_cell)).strip().lower()
    return None if cleaned in ('1', '-1', '[]', 'nan', '') else cleaned

# Force everything to text before normalizing.  np.str was just an alias
# for the builtin ``str`` (removed in NumPy >= 1.24), so cast with str
# directly; behavior is identical on older NumPy versions.
df[str_cols] = df[str_cols].astype(str).applymap(filter_str)
# Display the normalized columns for a sanity check.
df[str_cols]


Out[4]:
VAR_0001 VAR_0005 VAR_0044 VAR_0200 VAR_0202 VAR_0214 VAR_0216 VAR_0222 VAR_0237 VAR_0274 ... VAR_0325 VAR_0342 VAR_0352 VAR_0353 VAR_0354 VAR_0404 VAR_0466 VAR_0467 VAR_0493 VAR_1934
ID
2 h c None ft lauderdale batchinquiry None ds c6 fl fl ... None cf o u o chief executive officer None None community association manager iaps
4 h b None santee batchinquiry None ds c6 ca mi ... h ec o r r None i discharged None iaps
5 h c None reedsville batchinquiry None ds c6 wv wv ... r uu r r None None None None None iaps
7 h c None liberty batchinquiry None ds c6 tx tx ... h None r r None None None None None rcc
8 r n None frankfort batchinquiry None ds c6 il il ... s None r u o None None None None branch
14 r c None spring batchinquiry None ds c6 tx me ... h fe u r r None None None None iaps
16 h c None gresham batchinquiry None ds c6 or ca ... s dc o o o None None None None iaps
20 r b None warner robins batchinquiry None ds c6 ga sc ... s ff u o r None None None None iaps
21 r n None san antonio batchinquiry None ds c6 tx tx ... s None r u u None None None None mobile
22 r n None norristown batchinquiry None ds c6 pa pa ... s ee u u u None None None None branch
23 r c None murfreesboro batchinquiry None ds c6 tn tn ... h fe o r r None None None None branch
24 h c None cartersville batchinquiry None ds c6 ga ga ... s None r o u None i dismissed None branch
25 h b None bethlehem batchinquiry None ds c6 pa pa ... s None u o u None i discharged None iaps
26 r b None gig harbor batchinquiry None ds c6 wa ca ... s bc u u o None None None None branch
28 q b None bakersfield batchinquiry None ds c6 ca ca ... s None r o u None None None None iaps
30 r b None herndon batchinquiry None ds c6 va va ... h fe r r u None None None None iaps
31 h c None shelby gap batchinquiry None ds c6 ky ky ... s None r o o None None None None branch
32 r n None ontario batchinquiry None ds c6 ca al ... h ee r r r None None None None iaps
35 h c None memphis batchinquiry None ds c6 tn tn ... s fe o u u None None None None iaps
36 r b None bakersfied batchinquiry None ds c6 ca ca ... h ff u r u None None None None branch
37 r b None altamonte springs batchinquiry None ds c6 fl nc ... h ef r r u None None None None iaps
38 h b None barnesville batchinquiry None ds c6 pa pa ... r fd o r u None i discharged None branch
40 h b None houston batchinquiry None ds c6 tx tx ... s ff u u u conta None None None iaps
42 r n None peru batchinquiry None ds c6 in in ... s ff u u None None None None None branch
43 r b None tacoma batchinquiry None ds c6 wa ga ... s ee r u None None None None None iaps
46 r c None ozark batchinquiry None ds c6 mo mo ... s uu r u None None None None None iaps
50 r c None riverside batchinquiry None ds c6 ca ca ... None None o None None None None None None branch
51 r b None augusta batchinquiry None ds c6 ga ga ... s ff u u u None None None None iaps
52 r b None chicago batchinquiry None ds c6 il mn ... h ed r r o contact None None None iaps
54 r b None sevierville batchinquiry None ds c6 tn mi ... s dd u u u None None None None branch
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
290409 h n None las cruces batchinquiry None ds c6 nm il ... s ff r o None None None None None branch
290412 r c None martinsville batchinquiry None ds c6 in fl ... f ce r r None None i discharged None iaps
290414 r s None bakersfield batchinquiry None ds c6 ca ca ... s None u u u None None None None branch
290415 r c None la porte batchinquiry None ds c6 in in ... h fe u r u None i discharged None branch
290417 r s None newport news batchinquiry None ds c6 va md ... s fe u u r None None None None branch
290424 h b None englewood batchinquiry None ds c6 tn tn ... s ff o o o None None None None branch
290426 h c None warfield batchinquiry None ds c6 va va ... s uu u o r None i discharged nurse aide branch
290427 h c None malden batchinquiry None ds c6 mo mo ... s None u u r None None None None iaps
290429 r b None marietta batchinquiry None ds c6 ga ct ... p dd u r u None None None None iaps
290431 r c None prospect batchinquiry None ds c6 oh oh ... h fe o r o None None None None iaps
290432 r c None olive branch batchinquiry None ds c6 ms ms ... h ff r r u None None None None branch
290434 r b None san diego batchinquiry None ds c6 ca tx ... h ed u r u None None None None branch
290436 h b None summerville batchinquiry None ds c6 ga ga ... s ff u o r None i dismissed None iaps
290439 r b None alice batchinquiry None ds c6 tx tx ... p uu u r r None None None None branch
290440 r c None pittsvile batchinquiry None ds c6 va va ... h fd u r u None None None None branch
290441 r b None nashville batchinquiry None ds c6 nc nc ... h fe u r u None None None None iaps
290443 r b None wyncote batchinquiry None ds c6 pa pa ... None dc o u u secretary None None None branch
290445 r c None indianapolis batchinquiry None ds c6 in in ... s ef u u u None None None None branch
290447 r c None lagrande batchinquiry None ds c6 or pa ... s dd u u u None i discharged None iaps
290448 r c None commerce batchinquiry None ds c6 tx tx ... None ff o u r contact None None None iaps
290449 r b None thornville batchinquiry None ds c6 oh oh ... s None u o r None None None None iaps
290450 r n None new braunfels batchinquiry None ds c6 tx tx ... s dd u u u None None None None iaps
290452 h b None bethel batchinquiry None ds c6 oh oh ... p ff o r u None None None None branch
290453 r b None indianapolis batchinquiry None ds c6 in il ... h ff u r None None None None None iaps
290454 r n None katy batchinquiry None ds c6 tx tx ... s dd u u o None None None None iaps
290457 h c None lafayette batchinquiry None ds c6 la la ... s None r u r None None None None branch
290458 r c None gardena batchinquiry None ds c6 ca ca ... s be r u r None i discharged None iaps
290459 r b None covina batchinquiry None ds c6 ca ca ... p cc u r None None None None None branch
290461 h c None batavia batchinquiry None ds c6 oh None ... None None None None None None None None None iaps
290463 h b None sherman oaks batchinquiry None ds c6 ca ca ... None None None None None None None None None iaps

145231 rows × 22 columns

Vectorize string and datetime columns

String columns

See how many different values the string columns have


In [35]:
# Summarize the string columns and reorder them by the spread (std row)
# of their describe() stats, narrowest first, to spot near-constant ones.
str_desc = df[str_cols].describe()
str_desc = str_desc[sorted(str_desc.columns, key=lambda col: str_desc.loc['std', col])]
str_desc


Out[35]:
VAR_0214 VAR_0005 VAR_0467 VAR_0353 VAR_0352 VAR_0001 VAR_1934 VAR_0354 VAR_0283 VAR_0305 VAR_0325 VAR_0237 VAR_0342 VAR_0274 VAR_0493 VAR_0404 VAR_0200
count 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.000000 145231.00000
mean 0.000537 0.662001 0.321880 2.182723 2.008228 1.172580 1.125985 2.015279 4.273241 5.047738 5.362078 23.839256 26.243385 27.826828 20.664059 75.749227 5948.37755
std 0.066898 0.755864 0.756309 0.918042 0.928806 0.983218 1.096356 1.123611 1.432471 1.739619 2.385425 13.322850 15.853942 16.728497 87.876039 278.754038 3487.11275
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000
25% 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 1.000000 5.000000 5.000000 3.000000 11.000000 17.000000 12.000000 0.000000 0.000000 2804.00000
50% 0.000000 1.000000 0.000000 2.000000 2.000000 2.000000 2.000000 2.000000 5.000000 6.000000 7.000000 26.000000 32.000000 28.000000 0.000000 0.000000 5894.00000
75% 0.000000 1.000000 0.000000 3.000000 3.000000 2.000000 2.000000 3.000000 5.000000 6.000000 7.000000 36.000000 40.000000 41.000000 0.000000 0.000000 8983.00000
max 12.000000 3.000000 3.000000 3.000000 3.000000 2.000000 4.000000 3.000000 6.000000 7.000000 8.000000 45.000000 49.000000 56.000000 602.000000 1787.000000 12275.00000

Column VAR_0044 contains no values at all (it is entirely empty), so drop it.


In [25]:
# VAR_0044 is entirely empty, so discard it and keep str_cols in sync.
df.drop(['VAR_0044'], axis=1, inplace=True)
str_cols.remove('VAR_0044')

Columns VAR_0202, VAR_0216, VAR_0222 and VAR_0466 have only one value. Check if there's some correlation between the values and the target.

Replace their string values with 1 if there was something in the cell, or 0 if there wasn't.


In [26]:
# Check whether the four near-single-valued columns discriminate the
# target at all: compare each value's target split against the overall
# 76.75% / 23.25% base rate computed earlier.
analyze_str_columns(['VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0466'], df)


Total samples: 145231
##############################
------Counts-------
        VAR_0202_batchinquiry  VAR_0202_None
target                                      
0                      111415             43
1                       33760             13
----Percentages----
        VAR_0202_batchinquiry  VAR_0202_None
target                                      
0                   76.745307      76.785714
1                   23.254693      23.214286
##############################
------Counts-------
        VAR_0216_ds  VAR_0216_None
target                            
0            111415             43
1             33760             13
----Percentages----
        VAR_0216_ds  VAR_0216_None
target                            
0         76.745307      76.785714
1         23.254693      23.214286
##############################
------Counts-------
        VAR_0222_c6  VAR_0222_None
target                            
0            111415             43
1             33760             13
----Percentages----
        VAR_0222_c6  VAR_0222_None
target                            
0         76.745307      76.785714
1         23.254693      23.214286
##############################
------Counts-------
        VAR_0466_None  VAR_0466_i
target                           
0               94671       16787
1               27708        6065
----Percentages----
        VAR_0466_None  VAR_0466_i
target                           
0           77.358861   73.459653
1           22.641139   26.540347

For each of these columns, the target split is nearly identical across the column's values (and matches the overall base rate), so they carry no predictive signal and can be dropped.


In [27]:
# Their per-value target splits match the base rate, so these columns
# carry no signal: drop them and filter them out of str_cols.
cols = ['VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0466']
df.drop(cols, axis=1, inplace=True)
str_cols = [c for c in str_cols if c not in cols]
del cols

Encode the labels of the rest of the columns


In [33]:
# Replace each remaining string column with integer labels in place.
# fit_transform refits the encoder for every column, so each column gets
# its own independent 0..n-1 mapping (after the loop, `encoder` holds
# only the last column's classes).
# NOTE(review): these columns contain None entries after filter_str;
# LabelEncoder's handling of None mixed with strings is
# sklearn-version-dependent -- confirm on upgrade.
encoder = LabelEncoder()
for col in str_cols:
    df[col] = encoder.fit_transform(df[col])

Save preprocessed data to another csv file


In [36]:
# Persist the cleaned/encoded frame.  Note: PTRAIN_PATH is the same file
# read at the top of the notebook, so this overwrites the input -- a
# rerun of the load cell will pick up the already-encoded data.
df.to_csv(PTRAIN_PATH)

In [39]:
# Record which string columns were dropped so the same deletions can be
# replayed on the test set later.
deleted_str_cols = ['VAR_0044', 'VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0466']
with open('deleted_str_cols.pickle', 'wb') as fp:
    pickle.dump(deleted_str_cols, fp)